In [1]:
import gzip
import json
import pandas as pd
from collections import defaultdict, Counter
In [2]:
%%time
data = []
media_types = defaultdict(int)
url_types = defaultdict(int)
has_urls = 0
unique_urls = set()
with gzip.open("all_ids.txt.json.gz") as fp:
    for line in fp:
        d = json.loads(line.strip())
        data.append(d)
        if 'entities' not in d:
            continue
        # Count media entities (photos, videos, ...) by type
        if 'media' in d['entities']:
            m_entities = d['entities']['media']
            for m in m_entities:
                m_type = m['type']
                media_types[m_type] += 1
        # Count URL entities and tally their domains
        if 'urls' in d['entities']:
            m_entities = d['entities']['urls']
            if len(m_entities) > 0:
                has_urls += 1
            for m in m_entities:
                media_types['url'] += 1
                m = m['expanded_url']
                m_type = m.split("/", 3)[2]  # crude domain extraction from scheme://domain/...
                unique_urls.add((m, m_type))
                url_types[m_type] += 1
print(media_types)
url_types = Counter(url_types)
print("Of {} tweets, {} contain a total of {} urls with {} unique domains and {} unique urls".format(
    len(data), has_urls, media_types["url"], len(url_types), len(unique_urls)))
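A note on the domain extraction above: m.split("/", 3)[2] assumes every expanded URL has the scheme://domain/path shape and keeps any www. prefix, so nytimes.com and www.nytimes.com are counted separately. A more robust sketch using the standard library (Python 3's urllib.parse; the helper name is ours):

from urllib.parse import urlparse

def extract_domain(url):
    # urlparse copes with missing paths, ports, and query strings
    netloc = urlparse(url).netloc
    # fold "www.example.com" into "example.com"
    return netloc[4:] if netloc.startswith("www.") else netloc

extract_domain("https://www.nytimes.com/2016/some-story")  # -> 'nytimes.com'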
In [3]:
url_types.most_common(50)
Out[3]:
In [4]:
sorted(unique_urls,
       key=lambda x: url_types[x[1]],
       reverse=True)[:10]
Out[4]:
In [5]:
len(data)
Out[5]:
In [6]:
data[0].keys()
Out[6]:
In [7]:
data[0][u'source']
Out[7]:
In [8]:
data[0][u'is_quote_status']
Out[8]:
In [9]:
data[0][u'quoted_status']['text']
Out[9]:
In [10]:
data[0]['text']
Out[10]:
In [11]:
count_quoted = 0
has_coordinates = 0
count_replies = 0
language_ids = defaultdict(int)
count_user_locs = 0
user_locs = Counter()
count_verified = 0
for d in data:
    count_quoted += d.get('is_quote_status', 0)
    coords = d.get(u'coordinates', None)
    repl_id = d.get(u'in_reply_to_status_id', None)
    has_coordinates += (coords is not None)
    count_replies += (repl_id is not None)
    # User-level fields: free-text location and verified flag
    loc = d['user'].get('location', u'')
    count_verified += d['user']['verified']
    if loc != u'':
        count_user_locs += 1
        user_locs.update([loc])
    language_ids[d['lang']] += 1
print(count_quoted, has_coordinates, count_replies, count_user_locs, count_verified)
print("Of {} tweets, {} have coordinates, while {} have user locations, comprising {} unique locations".format(
    len(data), has_coordinates, count_user_locs, len(user_locs)
))
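The location counts above treat the user's location field as raw free text, so variants like "New York, NY" and "new york, ny" are tallied separately. A minimal normalization sketch (the case and whitespace folding here is an illustrative assumption, not part of the original pipeline):

norm_locs = Counter()
for loc, n in user_locs.items():
    # collapse case and surrounding whitespace; real cleanup would need geocoding
    norm_locs[loc.strip().lower()] += n
norm_locs.most_common(10)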
In [12]:
user_locs.most_common(10)
Out[12]:
In [13]:
len(data)
Out[13]:
In [14]:
data[0]['user']
Out[14]:
In [15]:
df = pd.read_csv("URL_CAT_MAPPINGS.txt", sep="\t")
df.head()
Out[15]:
In [16]:
df['URL_EXP_SUCCESS'] = (df.EXPANDED_STATUS < 2)
df.head()
Out[16]:
In [17]:
URL_DICT = dict(zip(df[df.URL_CATS != 'UNK'].URL, df[df.URL_CATS != 'UNK'].URL_CATS))
URL_MAPS = dict(zip(df.URL, df.URL_DOMAIN))
URL_EXP_SUCCESS = dict(zip(df.URL, df.URL_EXP_SUCCESS))
len(URL_DICT), df.shape, len(URL_MAPS), len(URL_EXP_SUCCESS)
Out[17]:
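The same three lookups can also be built directly in pandas without dict(zip(...)); an equivalent sketch over the columns above (with duplicate URLs, both forms keep the last occurrence):

URL_DICT = df.loc[df.URL_CATS != 'UNK'].set_index('URL')['URL_CATS'].to_dict()
URL_MAPS = df.set_index('URL')['URL_DOMAIN'].to_dict()
URL_EXP_SUCCESS = df.set_index('URL')['URL_EXP_SUCCESS'].to_dict()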
In [18]:
df.URL.head().values
Out[18]:
In [19]:
URL_MAPS['http://bit.ly/1SqTn5d']
Out[19]:
In [20]:
found_urls = 0
twitter_urls = 0
total_urls = 0
tid_mapped_urls = []
url_types = defaultdict(int)
for d in data:
    if 'urls' in d['entities']:
        m_entities = d['entities']['urls']
        for m in m_entities:
            total_urls += 1
            m = m['expanded_url']
            m_cats = "UNK"
            if m in URL_DICT:
                found_urls += 1
                m_cats = URL_DICT[m]
            elif m.startswith("https://twitter.com") or m.startswith("http://twitter.com"):
                found_urls += 1
                twitter_urls += 1
                m_cats = "socialmedia|twitter"
            else:
                # URL not in the category mapping; fall back to its domain if expansion succeeded
                m_type = "failed_url"
                if URL_EXP_SUCCESS.get(m, False):
                    m_type = URL_MAPS.get(m, "None.com")
                """
                m_type = m.split("/", 3)[2]
                #m_type = m_type.split("/", 3)[2]
                if m_type.startswith("www."):
                    m_type = m_type[4:]
                """
                url_types[m_type] += 1
            tid_mapped_urls.append((d["id"], m, m_cats))
print("Data: %s, Total: %s, Found: %s, Twitter: %s" % (len(data), total_urls, found_urls, twitter_urls))
url_types = Counter(url_types)
url_types.most_common(10)
Out[20]:
In [21]:
url_types.most_common(50)
Out[21]:
In [22]:
sum(url_types.values())
Out[22]:
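Given the branch structure above, url_types is only updated for URLs that were neither in URL_DICT nor twitter.com links, so its total should equal the unmatched remainder; a quick consistency check:

assert sum(url_types.values()) == total_urls - found_urls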
In [23]:
tid_mapped_urls[:10]
Out[23]:
In [24]:
df_mapped_cats = pd.DataFrame(tid_mapped_urls, columns=["TID", "URL", "CATS"])
df_mapped_cats.head()
Out[24]:
In [25]:
df_mapped_cats.to_csv("TID_URL_CATS.txt", sep="\t", index=False)
! head TID_URL_CATS.txt
In [26]:
def extract_meta_features(x):
    """Flatten one tweet dict into a tuple of tweet-level and user-level features."""
    u_data = x["user"]
    u_url = u_data['url']
    if u_url is not None:
        u_url = u_data['entities']['url']['urls'][0]['expanded_url']
    return (x["id"],
            x['created_at'],
            x['retweet_count'],
            x['favorite_count'],
            x['in_reply_to_status_id'] is not None,
            'quoted_status' in x and x['quoted_status'] is not None,
            len(x['entities']['hashtags']),
            len(x['entities']['urls']),
            len(x['entities']['user_mentions']),
            0 if 'media' not in x['entities'] else len(x['entities']['media']),  # Has photos
            u_data['id'],
            u_data[u'created_at'],
            u_data[u'listed_count'],
            u_data[u'favourites_count'],
            u_data[u'followers_count'],
            u_data[u'friends_count'],
            u_data[u'statuses_count'],
            u_data[u'verified'],
            u_data[u'location'].replace('\r', ''),  # strip carriage returns so the TSV export stays one row per tweet
            u_data[u'name'].replace('\r', ''),
            u_url
            )
In [27]:
extract_meta_features(data[0])
Out[27]:
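extract_meta_features indexes most keys directly, so a tweet missing any of them (e.g., a truncated or deleted record) raises KeyError. A defensive variant of the same idea, sketched for a few fields only (the helper name and default values are assumptions, not Twitter-documented behavior):

def safe_meta(x):
    # hypothetical defensive helper; defaults are illustrative assumptions
    u = x.get("user", {})
    ents = x.get("entities", {})
    return (x.get("id"),
            x.get("retweet_count", 0),
            len(ents.get("hashtags", [])),
            len(ents.get("media", [])),
            u.get("verified", False))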
In [28]:
df_meta = pd.DataFrame((extract_meta_features(d) for d in data),
                       columns=["t_id", "t_created", "t_retweets",
                                "t_favorites", "t_is_reply", "t_is_quote",
                                "t_n_hashtags", "t_n_urls", "t_n_mentions",
                                "t_n_media",
                                "u_id", "u_created",
                                "u_n_listed", "u_n_favorites", "u_n_followers",
                                "u_n_friends", "u_n_statuses",
                                "u_is_verified", "u_location", "u_name", "u_url"
                                ])
df_meta.head()
Out[28]:
In [29]:
df_meta.dtypes
Out[29]:
In [30]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].head()
Out[30]:
In [31]:
df_meta.to_csv("TID_META.txt", sep="\t", index=False, encoding='utf-8')
! head TID_META.txt
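A quick way to verify the export round-trips; the dtype pin for the ID columns is a precaution we add (64-bit tweet IDs can otherwise be parsed as floats if any values are missing), not something the export itself requires:

df_back = pd.read_csv("TID_META.txt", sep="\t", encoding="utf-8",
                      dtype={"t_id": "int64", "u_id": "int64"})
df_back.shape == df_meta.shape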
In [32]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].shape
Out[32]:
In [33]:
df_meta.shape
Out[33]: